suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(RColorBrewer)
library(tidyverse)
library(UpSetR)
library(VennDiagram)
})
suppressMessages({
source(here("UPSCb-common/Rtoolbox/src/plotEnrichedTreemap.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
threads=1
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
p <- ggplot(bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,])),
aes(x=Time,y=value,col=Time,group=Time)) +
geom_point() + geom_smooth() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}
"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE,filter=c("median",NULL),...){
# get the filter
if(!is.null(match.arg(filter))){
filter <- rowMedians(counts(dds,normalized=TRUE))
message("Using the median normalized counts as default, set filter=NULL to revert to using the mean")
}
# validation
if(length(contrast)==1){
res <- results(dds,name=contrast,filter = filter)
} else {
res <- results(dds,contrast=contrast,filter = filter)
}
stopifnot(length(sample_sel)==ncol(vst))
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
if(debug){
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE genes"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE genes per interval"))
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE genes per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) &
res$baseMean >= expression_cutoff
if(verbose){
message(sprintf(paste(
ifelse(sum(sel)==1,
"There is %s gene that is DE",
"There are %s genes that are DE"),
"with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s"),
sum(sel),padj,
lfc,expression_cutoff))
}
# proceed only if there are DE genes
if(sum(sel) > 0){
val <- rowSums(vst[sel,sample_sel,drop=FALSE])==0
if (sum(val) >0){
warning(sprintf(paste(
ifelse(sum(val)==1,
"There is %s DE gene that has",
"There are %s DE genes that have"),
"no vst expression in the selected samples"),sum(val)))
sel[sel][val] <- FALSE
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
}
if(plot & sum(sel)>1){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel],...
)
}
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
extractEnrichmentResults <- function(enrichment,task="go",
diff.exp=c("all","up","dn"),
go.namespace=c("BP","CC","MF"),
genes=NULL,export=TRUE,plot=TRUE,
default_dir=here("data/analysis/DE"),
default_prefix="DE",
url="athaliana"){
# process args
diff.exp <- match.arg(diff.exp)
de <- ifelse(diff.exp=="all","none",
ifelse(diff.exp=="dn","down",diff.exp))
# sanity
if( is.null(enrichment[[task]]) | length(enrichment[[task]]) == 0){
message(paste("No enrichment for",task))
} else {
# write out
if(export){
write_tsv(enrichment[[task]],
file=here(default_dir,
paste0(default_prefix,"-genes_GO-enrichment.tsv")))
if(!is.null(genes)){
write_tsv(
enrichedTermToGenes(genes=genes,terms=enrichment[[task]]$id,url=url,mc.cores=16L),
file=here(default_dir,
paste0(default_prefix,"-enriched-term-to-genes.tsv"))
)
}
}
if(plot){
sapply(go.namespace,function(ns){
titles <- c(BP="Biological Process",
CC="Cellular Component",
MF="Molecular Function")
suppressWarnings(tryCatch({plotEnrichedTreemap(enrichment,enrichment=task,
namespace=ns,
de=de,title=paste(default_prefix,titles[ns]))},
error = function(e) {
message(paste("Treemap plot failed for",ns,
"because of:",e))
return(NULL)
}))
})
}
}
}
getBgGenes <- function(dds,threads=1L){
ifilt <- mclapply(resultsNames(dds),
function(nam,dds){
r<-results(dds,name=nam)
return(rownames(r)[!is.na(r$padj)])
},dds,mc.cores=threads)
names(ifilt) <- resultsNames(dds)
nobg <- Reduce(union,ifilt)
upset(fromList(lapply(ifilt,function(i){which(nobg %in% i)})))
return(nobg)
}
load(here("data/analysis/salmon/dds-sample-swap-corrected.rda"))
dds$Response <- factor(levels=c("control","acute","early","late"),
sub(".*hrs","late",
gsub("12hrs|^4hrs","early",
sub("60min","acute",
sub("std","control",dds$Time)))))
vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/vst-aware.rda"))
write_delim(as.data.frame(vst) %>% rownames_to_column("ID"),
here("data/analysis/DE/vst-aware.tsv"))
#goi <- read_lines(here("doc/goi.txt"))
#stopifnot(all(goi %in% rownames(vst)))
#dev.null <- lapply(goi,line_plot,dds=dds,vst=vst)
design(dds) <- ~Response
dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1358 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
plotDispEsts(dds)
Check the different contrasts
resultsNames(dds)
## [1] "Intercept" "Response_acute_vs_control"
## [3] "Response_early_vs_control" "Response_late_vs_control"
dir.create(here("data/analysis/DE/Response"))
## Warning in dir.create(here("data/analysis/DE/Response")):
## '/mnt/picea/home/delhomme/Git/algalAcclimatization/data/analysis/DE/Response'
## already exists
acute_vs_control <- extract_results(dds,vst,"Response_acute_vs_control",
default_dir=here("data/analysis/DE/Response"),
default_prefix="acute_vs_control_",
sample_sel=dds$Response %in% c("acute","control"),
labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## Loading required package: LSD
## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8336 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
## Warning in extract_results(dds, vst, "Response_acute_vs_control", default_dir =
## here("data/analysis/DE/Response"), : There are 6 DE genes that have no vst
## expression in the selected samples
early_vs_control <- extract_results(dds,vst,"Response_early_vs_control",
default_dir=here("data/analysis/DE/Response"),
default_prefix="early_vs_control_",
sample_sel=dds$Response %in% c("early","control"),
labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 13498 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
late_vs_control <- extract_results(dds,vst,"Response_late_vs_control",
default_dir=here("data/analysis/DE/Response"),
default_prefix="late_vs_control_",
sample_sel=dds$Response %in% c("late","control"),
labels=dds$Response)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.2292, removing 28.34246% of the data
## The independent filtering maximises for 28.34246 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8716 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
res.list <- list(acute=acute_vs_control,
early=early_vs_control,
late=late_vs_control)
grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","all"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","up"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list,"[[","dn"),
NULL,
fill=pal[1:3]))
background <- getBgGenes(dds,threads=length(resultsNames(dds)))
enr.list <- lapply(res.list,function(r){
lapply(r,gopher,background=background,task="go",url="algae")
})
dev.null <- lapply(names(enr.list),function(n){
lapply(names(enr.list[[n]]),function(de){
extractEnrichmentResults(enr.list[[n]][[de]],
diff.exp=de,
genes=res.list[[n]][[de]],
default_prefix=paste(n,de,sep="-"),
url="algae")
})
})
## Loading required package: treemap
dev.null <- lapply(list.files(here("data/analysis/DE"),pattern="*-genes_GO-enrichment.tsv",full.names=TRUE),
function(fil){
write_tsv(read_tsv(fil,col_select=c("id","padj"),
show_col_types=FALSE),
here(sub("\\.tsv","_for-REVIGO.tsv",fil)),
col_names=FALSE)
})
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] treemap_2.4-4 GO.db_3.18.0
## [3] AnnotationDbi_1.64.1 jsonlite_1.8.7
## [5] LSD_4.1-0 limma_3.58.1
## [7] VennDiagram_1.7.3 futile.logger_1.4.3
## [9] UpSetR_1.4.0 lubridate_1.9.3
## [11] forcats_1.0.0 stringr_1.5.0
## [13] dplyr_1.1.3 purrr_1.0.2
## [15] readr_2.1.4 tidyr_1.3.0
## [17] tibble_3.2.1 tidyverse_2.0.0
## [19] RColorBrewer_1.1-3 hyperSpec_0.100.0
## [21] xml2_1.3.5 ggplot2_3.4.4
## [23] lattice_0.21-8 here_1.0.1
## [25] gplots_3.1.3 DESeq2_1.42.0
## [27] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [29] MatrixGenerics_1.14.0 matrixStats_1.1.0
## [31] GenomicRanges_1.54.1 GenomeInfoDb_1.38.0
## [33] IRanges_2.36.0 S4Vectors_0.40.1
## [35] BiocGenerics_0.48.1 data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 magrittr_2.0.3 farver_2.1.1
## [4] rmarkdown_2.25 zlibbioc_1.48.0 vctrs_0.6.4
## [7] memoise_2.0.1 RCurl_1.98-1.13 htmltools_0.5.7
## [10] S4Arrays_1.2.0 lambda.r_1.2.4 curl_5.1.0
## [13] SparseArray_1.2.1 sass_0.4.7 KernSmooth_2.23-22
## [16] bslib_0.5.1 plyr_1.8.9 testthat_3.2.0
## [19] futile.options_1.0.1 cachem_1.0.8 igraph_1.5.1
## [22] mime_0.12 lifecycle_1.0.4 pkgconfig_2.0.3
## [25] Matrix_1.6-0 R6_2.5.1 fastmap_1.1.1
## [28] GenomeInfoDbData_1.2.11 shiny_1.7.5.1 digest_0.6.33
## [31] colorspace_2.1-0 rprojroot_2.0.4 RSQLite_2.3.2
## [34] labeling_0.4.3 fansi_1.0.5 timechange_0.2.0
## [37] httr_1.4.7 abind_1.4-5 compiler_4.3.1
## [40] bit64_4.0.5 withr_2.5.2 BiocParallel_1.36.0
## [43] DBI_1.1.3 highr_0.10 DelayedArray_0.28.0
## [46] gtools_3.9.4 caTools_1.18.2 tools_4.3.1
## [49] httpuv_1.6.12 glue_1.6.2 promises_1.2.1
## [52] gridBase_0.4-7 generics_0.1.3 gtable_0.3.4
## [55] tzdb_0.4.0 hms_1.1.3 utf8_1.2.4
## [58] XVector_0.42.0 pillar_1.9.0 vroom_1.6.4
## [61] later_1.3.1 bit_4.0.5 deldir_1.0-9
## [64] tidyselect_1.2.0 locfit_1.5-9.8 Biostrings_2.70.1
## [67] knitr_1.45 gridExtra_2.3 xfun_0.41
## [70] statmod_1.5.0 brio_1.1.3 stringi_1.7.12
## [73] lazyeval_0.2.2 yaml_2.3.7 evaluate_0.23
## [76] codetools_0.2-19 interp_1.1-4 cli_3.6.1
## [79] xtable_1.8-4 munsell_0.5.0 jquerylib_0.1.4
## [82] Rcpp_1.0.11 png_0.1-8 ellipsis_0.3.2
## [85] blob_1.2.4 latticeExtra_0.6-30 jpeg_0.1-10
## [88] bitops_1.0-7 scales_1.2.1 crayon_1.5.2
## [91] rlang_1.1.2 KEGGREST_1.42.0 formatR_1.14